#! /usr/bin/env python
#
# check_po - a gramps tool to check validity of po files
#
# Copyright (C) 2006-2006  Kees Bakker
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

#
# TODO
#
# * Check for HTML text in msgstr when there is none in msgid
# * Check for matching HTML tag/endtag in msgstr
#

import sys
import re
import os
import io
from argparse import ArgumentParser

# Module-level dictionaries. None of them is read or written by the code
# visible in this file; presumably they were meant to collect per-file
# statistics across several PO files.
# NOTE(review): confirm nothing outside this file uses them before removing.
all_total = {}
all_fuzzy = {}
all_untranslated = {}
all_percent_s = {}
all_named_s = {}
all_bnamed_s = {}
all_context = {}
all_coverage = {}
all_template_coverage = {}

def strip_quotes(st):
    """Return st without its enclosing double quotes.

    The quotes are only removed when st is at least two characters long
    and both its first and its last character are '"'; any other input is
    returned unchanged.
    """
    is_quoted = len(st) >= 2 and st[0] == '"' and st[-1] == '"'
    return st.strip()[1:-1] if is_quoted else st

class CheckException( Exception ):
    """Error raised when a PO file cannot be parsed by this tool."""

# This is a base class for all checks
class Check:
    """Base class for all checks.

    A check collects the offending messages in self.msgs. Subclasses must
    set self.diag_header (title printed above the detailed report) and
    self.summary_text (label of the one-line summary); neither is set here.
    """

    def __init__( self ):
        # Messages that failed this check, in the order they were found
        self.msgs = []

    def diag( self ):
        """Print the header and every collected message; silent if none."""
        if self.msgs:
            # print("") emits an empty separator line. A bare `print` is only
            # a name lookup in Python 3 and prints nothing.
            print("")
            print(self.diag_header)
            for m in self.msgs:
                m.diag()

    def summary( self ):
        """Print the summary label followed by the number of failures."""
        print("%-20s%d" % ( self.summary_text, len(self.msgs) ))

class Check_fmt( Check ):
    """Check that a literal format token occurs equally often in the
    original text and in its translation.

    fmt is the token to count, for example '%s' or '%d'.
    """

    def __init__( self, fmt ):
        Check.__init__( self )
        self.fmt = fmt
        self.summary_text = "%s mismatches:" % fmt
        self.diag_header = "-------- %s mismatches --------------" % fmt

    def __process( self, msg, msgid, msgstr ):
        # Record msg when the token counts of the pair differ
        if msgid.count( self.fmt ) != msgstr.count( self.fmt ):
            self.msgs.append( msg )

    def process( self, msg ):
        """Compare msgid with msgstr[0] and, for plural entries that have
        at least two translations, msgid_plural with msgstr[1]."""
        self.__process( msg, msg.msgid, msg.msgstr[0] )

        has_plural = msg.msgidp and len(msg.msgstr) >= 2
        if has_plural:
            self.__process( msg, msg.msgidp, msg.msgstr[1] )

class Check_named_fmt( Check ):
    """Check that the named %-formats (such as %(name)s) of the original
    text and of its translation are the same, ignoring their order."""

    # A pattern to find all %()
    # Written as a raw string: '\(' and '\w' are regex escapes, not valid
    # Python string escapes, and would otherwise trigger a warning.
    find_named_fmt_pat = re.compile(r'% \( \w+ \) \d* \D', re.VERBOSE)

    def __init__( self ):
        Check.__init__( self )
        self.diag_header = "-------- %() name mismatches --------------"
        self.summary_text = "%() name mismatches:"

    def __process( self, msg, msgid, msgstr ):
        # Comparing the sorted lists covers both a different number of
        # formats and a different set of names.
        fmts1 = sorted( self.find_named_fmt_pat.findall( msgid ) )
        fmts2 = sorted( self.find_named_fmt_pat.findall( msgstr ) )
        if fmts1 != fmts2:
            self.msgs.append( msg )

    def process( self, msg ):
        """Compare msgid with msgstr[0] and, for plural entries that have
        at least two translations, msgid_plural with msgstr[1]."""
        self.__process( msg, msg.msgid, msg.msgstr[0] )

        if msg.msgidp and len(msg.msgstr) >= 2:
            self.__process( msg, msg.msgidp, msg.msgstr[1] )

class Check_mapping_fmt( Check ):
    """Check that the {}-style named formats of the original text and of
    its translation are the same, ignoring their order.

    A translation may carry an extra '.f[...]' suffix inside a format
    (presumably a grammatical-form selector; see the Russian note below).
    Such a suffix is ignored when comparing.
    """

    # A pattern to find all {}
    # Raw string, so that '\{' and '\w' reach the regex engine untouched.
    find_map_pat = re.compile(r'\{\w+\.?f?\[?.*?\]?\}',
                              re.UNICODE) # needed for Russian index letters
    # The optional '.f[...]' suffix allowed in translated formats
    form_suffix_pat = re.compile( r'\.f\[.+\]' )

    def __init__( self ):
        Check.__init__( self )
        self.diag_header = "-------- {} name mismatches --------------"
        self.summary_text = "{} name mismatches:"

    def __process( self, msg, msgid, msgstr ):
        # Same number of named formats?
        fmts1 = sorted( self.find_map_pat.findall( msgid ) )
        fmts2 = sorted( self.find_map_pat.findall( msgstr ) )
        if len( fmts1 ) != len( fmts2 ):
            self.msgs.append( msg )
            return

        # Do we have the same named formats?
        if fmts1 == fmts2:
            return

        # Retry without the '.f[...]' suffixes of the translation. The list
        # must be sorted again: removing a suffix can change the relative
        # order of the entries, and comparing an unsorted list against the
        # sorted msgid formats would report a mismatch that is not there.
        stripped = sorted( self.form_suffix_pat.sub( '', fmt )
                           for fmt in fmts2 )
        if fmts1 != stripped:
            self.msgs.append( msg )

    def process( self, msg ):
        """Compare msgid with msgstr[0] and, for plural entries that have
        at least two translations, msgid_plural with msgstr[1]."""
        self.__process( msg, msg.msgid, msg.msgstr[0] )

        if msg.msgidp and len(msg.msgstr) >= 2:
            self.__process( msg, msg.msgidp, msg.msgstr[1] )

class Check_missing_sd( Check ):
    """Check that every %(name) format in a translation is followed by a
    conversion character 's' or 'd' (after optional digits)."""

    # A pattern to find %() without s or d
    # Here is a command to use for testing
    # print(re.compile(r'% \( \w+ \) \d* (\D|$)', re.VERBOSE).findall( '%(event_name)s: %(place)s%(endnotes)s. ' ))
    # The single group captures the character after the format, or '' at the
    # end of the string. Raw string: '\(' is a regex escape, not a valid
    # Python string escape.
    find_named_fmt_pat2 = re.compile(r'% \( \w+ \) \d* (\D|$)', re.VERBOSE)

    def __init__( self ):
        Check.__init__( self )
        self.diag_header = "-------- %() without 's' or 'd' mismatches --------------"
        self.summary_text = "%() missing s/d:"

    def process( self, msg ):
        """Record msg once for every translated form that contains a
        %(name) format not followed by 's' or 'd'."""
        for msgstr in msg.msgstr:
            conversions = self.find_named_fmt_pat2.findall( msgstr )
            if any( conv not in ('s', 'd') for conv in conversions ):
                self.msgs.append( msg )

class Check_runaway( Check ):
    """Check for context prefixes ("context|word") that were copied into
    the translation instead of translating only the part after the '|'."""

    def __init__( self ):
        Check.__init__( self )
        self.summary_text = "Runaway context:"
        self.diag_header = "-------- Runaway context in translation ---------"

    def __process( self, msg, msgid, msgstr ):
        # Only pairs where both sides contain a '|' are suspicious
        if '|' not in msgid or '|' not in msgstr:
            return
        # An untouched copy of the original is not reported
        if msgid == msgstr:
            return
        # Exceptions: the _datestrings.py keyword and the month name lists
        if ' lexeme' in msgid or 'alternative month names ' in msgid:
            return
        self.msgs.append( msg )

    def process( self, msg ):
        """Check msgid against msgstr[0] and, for plural entries that have
        at least two translations, msgid_plural against msgstr[1]."""
        self.__process( msg, msg.msgid, msg.msgstr[0] )

        has_plural = msg.msgidp and len(msg.msgstr) >= 2
        if has_plural:
            self.__process( msg, msg.msgidp, msg.msgstr[1] )

class Check_xml_chars( Check ):
    """Check translations of tips.xml messages for unescaped XML special
    characters."""

    # Special XML characters
    # It is not allowed to have a quote, an ampersand or an angle bracket
    # The alternatives are, in order:
    #   '<' unless it starts one of the tags <b> </b> <i> </i> <br/>
    #   '>' unless it ends one of those tags (negative lookbehind on the two
    #       characters before it; the former '(?<=!' was a positive
    #       lookbehind for a literal '!' and so almost never matched)
    #   '"'
    #   '&' unless it starts &quot; &nbsp; &gt; or &amp;
    xml_chars_pat = re.compile( r'<(?!(b>|/b>|i>|/i>|br/>)) | (?<!(<b|/b|<i|/i|r/))> | " | & (?!(quot|nbsp|gt|amp);)', re.VERBOSE )

    def __init__( self ):
        Check.__init__( self )
        self.diag_header = "-------- unescaped XML special characters ---------"
        self.summary_text = "XML special chars:"

    def process( self, msg ):
        """Record msg when it belongs to tips.xml and its first translation
        contains a forbidden character."""
        # XML errors
        # Only look at messages in the tips.xml
        if msg.is_tips_xml and self.xml_chars_pat.search( msg.msgstr[0] ):
            self.msgs.append( msg )

class Check_last_char( Check ):
    """Check that original and translation end alike: both or neither with
    white space, a period, a colon or a closing parenthesis."""

    # Terminator in the msgid, paired with the additional character that is
    # accepted in its place in the msgstr (Chinese)
    _terminators = (
        ( '.', '\u3002' ),
        ( ':', '\uff1a' ),
        ( ')', '\uff09' ),
    )

    def __init__( self ):
        Check.__init__( self )
        self.summary_text = "Last character:"
        self.diag_header = "-------- last character not identical ---------"

    def __process( self, msg, msgid, msgstr ):
        # Slices instead of indexing, so that empty strings are tolerated
        src_end = msgid[-1:]
        dst_end = msgstr[-1:]

        if src_end.isspace() != dst_end.isspace():
            self.msgs.append( msg )
            return

        # Report at most once, on the first terminator that disagrees
        for plain, wide in self._terminators:
            if (src_end == plain) != (dst_end in ( plain, wide )):
                self.msgs.append( msg )
                return

    def process( self, msg ):
        """Check msgid against msgstr[0] and, for plural entries that have
        at least two translations, msgid_plural against msgstr[1].
        Fuzzy messages are not checked."""
        # Last character of msgid? White space? Period?
        if msg.is_fuzzy:
            return
        self.__process( msg, msg.msgid, msg.msgstr[0] )

        has_plural = msg.msgidp and len(msg.msgstr) >= 2
        if has_plural:
            self.__process( msg, msg.msgidp, msg.msgstr[1] )

class Check_shortcut_trans( Check ):
    """Check for an underscore (shortcut key marker) in the translation
    when the original text has none."""

    def __init__( self ):
        Check.__init__( self )
        self.summary_text = "Shortcut in msgstr:"
        self.diag_header = "-------- shortcut key in translation ---------"

    def __process( self, msg, msgid, msgstr ):
        # Record msg when only the translation contains a '_'
        if '_' not in msgid and '_' in msgstr:
            self.msgs.append( msg )

    def process( self, msg ):
        """Check msgid against msgstr[0] and, for plural entries that have
        at least two translations, msgid_plural against msgstr[1]."""
        self.__process( msg, msg.msgid, msg.msgstr[0] )

        has_plural = msg.msgidp and len(msg.msgstr) >= 2
        if has_plural:
            self.__process( msg, msg.msgidp, msg.msgstr[1] )

class Msgid:
    """One entry of a PO file: comments, msgid, optional msgid_plural and
    one msgstr per plural form.

    The add_* methods are fed the raw lines of the entry one at a time.
    They keep the raw line for diagnostics and append the unquoted text to
    the corresponding attribute.
    """

    fuzzy_pat = re.compile( 'fuzzy' )
    tips_xml_pat = re.compile( r'tips\.xml' )

    def __init__( self, msgnr, lineno ):
        self._msgid = [] # For debugging purpose the original text
        self._msgidp = [] # For debugging purpose the original text
        self._msgstr = [] # For debugging purpose the original text
        self.msgid = ''
        self.msgidp = ''
        self.msgstr = [] # This is a list to support plural
        self._cmnt = []
        self.nr = msgnr         # Sequence number of the entry
        self.lineno = lineno    # Line on which the entry starts
        self.is_fuzzy = 0
        self.is_tips_xml = 0

    def diag( self ):
        """Print the entry number, its line number and the raw msgid,
        msgid_plural and msgstr lines."""
        # print("") emits an empty separator line. A bare `print` is only a
        # name lookup in Python 3 and prints nothing.
        print("")
        fuzzy_note = " (fuzzy)" if self.is_fuzzy else ""
        print("msg nr: %d, lineno: %d%s" %
                 ( self.nr, self.lineno, fuzzy_note ))
        sys.stdout.write( ''.join( self._msgid ) )
        sys.stdout.write( ''.join( self._msgidp ) )
        if sys.version_info[0] < 3:
            sys.stdout.write(''.join(self._msgstr).encode('utf-8'))
        else:
            sys.stdout.write( ''.join( self._msgstr ) )

    @staticmethod
    def _unquote( keyword_pat, line, lineno ):
        """Return the text of line without the leading keyword matched by
        keyword_pat and without the enclosing double quotes.

        keyword_pat must be anchored at the start of the line, so that the
        same word inside the quoted text is left alone. An error is printed
        (but the text is still returned) when a quote is missing.
        """
        text = re.sub( keyword_pat, '', line ).strip()
        # Slices, not indexes: an empty line must not raise IndexError
        if text[:1] != '"' or text[-1:] != '"':
            print("ERROR at line %d: Missing quote." % lineno)
        return strip_quotes( text )

    def add_msgid( self, line, lineno ):
        """Add a msgid line, or a continuation line of it."""
        self._msgid.append( line )
        self.msgid += self._unquote( r'^\s*msgid\s+', line, lineno )

    def add_msgidp( self, line, lineno ):
        """Add a msgid_plural line, or a continuation line of it."""
        self._msgidp.append( line )
        self.msgidp += self._unquote( r'^\s*msgid_plural\s+', line, lineno )

    def add_new_msgstr( self, line, lineno ):
        """Start a new translation (plural form) with a msgstr line."""
        self.msgstr.append( '' ) # Start a new msgstr
        self.add_msgstr( line, lineno )

    def add_msgstr( self, line, lineno ):
        """Add a msgstr line, or a continuation line, to the translation
        started last. add_new_msgstr must have been called before."""
        self._msgstr.append( line )
        self.msgstr[-1] += self._unquote( r'^\s*msgstr(\[\d\])?\s+',
                                          line, lineno )

    def add_cmnt( self, line ):
        """Add a comment line and update is_fuzzy and is_tips_xml."""
        self._cmnt.append( line )
        # The fuzzy flag lives on a '#,' flag line. Other comments, such as
        # a source reference that merely contains the word, do not count.
        if (not self.is_fuzzy and line.startswith( '#,' )
                and self.fuzzy_pat.search( line )):
            self.is_fuzzy = 1
        if not self.is_tips_xml and self.tips_xml_pat.search( line ):
            self.is_tips_xml = 1

def create_new_Msgid( msgs, lineno ):
    """Create an empty Msgid starting at lineno, append it to msgs and
    return it. The entry is numbered with the length of msgs before the
    append."""
    entry = Msgid( len(msgs), lineno )
    msgs.append( entry )
    return entry

def read_msgs( fname ):
    """Parse the PO (or POT) file fname and return its entries.

    The file is read as UTF-8 and handled line by line by a small state
    machine. Blank lines are ignored and obsolete lines ('#~ ') are
    skipped. Entries that end up with neither msgid nor msgstr text are
    left out of the result.

    Returns a list of Msgid objects. As a side effect the same list is
    stored in the module-level global msgs.

    Raises CheckException when a keyword shows up where the state machine
    does not allow it, or when a line is not recognised at all.
    """
    empty_pat   = re.compile( r'^ \s* $',      re.VERBOSE )
    comment_pat = re.compile( r'\#',           re.VERBOSE )
    msgid_pat   = re.compile( r'msgid \s+ "',  re.VERBOSE )
    msgid_plural_pat = re.compile( r'msgid_plural \s+ "', re.VERBOSE )
    msgstr_pat  = re.compile( r'msgstr (\[\d\])? \s+ "', re.VERBOSE )
    str_pat     = re.compile( r'"',            re.VERBOSE )
    old_pat     = re.compile( r'\#~ \s+ ',     re.VERBOSE )

    # The context manager closes the file even when reading fails
    with io.open( fname, encoding='utf-8' ) as f:
        lines = f.readlines()

    # parse it like a statemachine
    NONE   = 'NONE' # Nothing detected, yet
    CMNT   = 'CMNT' # Inside comment part
    MSGID  = 'msgid' # Inside msgid part
    MSGIDP = 'msgid_plural' # Inside msgid_plural part
    MSGSTR = 'msgstr' # Inside msgstr part
    STR    = 'STR' # A continuation string
    OLD    = 'OLD' # An old pattern with #~

    global msgs
    state = NONE
    msg = None

    msgs = []
    for ix, line in enumerate( lines ):     # Use line numbers for messages
        lineno = ix + 1
        where = '%s:%d' % ( fname, lineno ) # Location used in diagnostics

        if empty_pat.match( line ):
            continue # Empty lines are not interesting

        # What's the next state?
        # old_pat must be tried before comment_pat, which matches it too.
        if  old_pat.match( line ):
            next_state = OLD
        elif comment_pat.match( line ):
            next_state = CMNT
        elif msgid_pat.match( line ):
            next_state = MSGID
        elif msgid_plural_pat.match( line ):
            next_state = MSGIDP
        elif msgstr_pat.match( line ):
            next_state = MSGSTR
        elif str_pat.match( line ):
            next_state = STR
        else:
            print('WARNING: Unexpected input at %s' % where)
            next_state = NONE

        if state == NONE:
            # expect msgid or comment or old stuff
            if next_state == CMNT:
                state = CMNT
                # Start with an empty new item
                msg = create_new_Msgid( msgs, lineno )
                msg.add_cmnt( line )

            elif next_state == MSGID:
                state = MSGID
                # Start with an empty new item
                msg = create_new_Msgid( msgs, lineno )
                msg.add_msgid( line, lineno )

            elif next_state == MSGIDP:
                raise CheckException( 'Unexpected %s at %s'
                                      % ( next_state, where ) )

            elif next_state == MSGSTR:
                print('WARNING: Wild msgstr at %s' % where)
                state = MSGSTR
                # Start with an empty new item
                msg = create_new_Msgid( msgs, lineno )
                msg.add_new_msgstr( line, lineno )

            elif next_state == STR:
                print('WARNING: Wild string at %s' % where)

            elif next_state == OLD:
                pass    # Just skip

            else:
                raise CheckException( 'Unexpected state in po parsing '
                                      '(state = %s)' % state )

        elif state == CMNT:
            # Expect more comment, or msgid.
            # If msgstr or string it is flagged as error.
            if next_state == CMNT:
                if msg:
                    msg.add_cmnt( line )
                else:
                    # Note. We may need to do something about these comments
                    # Skip for now
                    pass

            elif next_state == MSGID:
                state = MSGID
                if not msg:
                    # Start with an empty new item
                    msg = create_new_Msgid( msgs, lineno )
                msg.add_msgid( line, lineno )

            elif next_state == MSGIDP:
                raise CheckException( 'Unexpected %s at %s'
                                      % ( next_state, where ) )

            elif next_state == MSGSTR:
                print('WARNING: Wild msgstr at %s' % where)
                state = MSGSTR
                # Start with an empty new item
                msg = create_new_Msgid( msgs, lineno )
                msg.add_new_msgstr( line, lineno )

            elif next_state == STR:
                print('WARNING: Wild string at %s' % where)

            elif next_state == OLD:
                # Drop the current item; the state itself is kept
                msg = None

            else:
                raise CheckException( 'Unexpected state in po parsing '
                                      '(state = %s)' % state )

        elif state == MSGID:
            # Expect msgstr or msgid_plural or string
            if next_state == CMNT:
                # Hmmm. A comment here?
                print('WARNING: Unexpected comment at %s' % where)

            elif next_state == MSGID:
                raise CheckException( 'Unexpected %s at %s'
                                      % ( next_state, where ) )

            elif next_state == MSGIDP:
                state = MSGIDP
                msg.add_msgidp( line, lineno )

            elif next_state == MSGSTR:
                state = MSGSTR
                msg.add_new_msgstr( line, lineno )

            elif next_state == STR:
                # Continuation of msgid, stay in state MSGID
                msg.add_msgid( line, lineno )

            elif next_state == OLD:
                # Drop the current item; the state itself is kept
                msg = None

            else:
                raise CheckException( 'Unexpected state in po parsing '
                                      '(state = %s)' % state )

        elif state == MSGIDP:
            # Expect msgstr or string or comment
            if next_state == CMNT:
                # Hmmm. A comment here?
                print('WARNING: Unexpected comment at %s' % where)

            elif next_state == MSGID:
                raise CheckException( 'Unexpected %s at %s'
                                      % ( next_state, where ) )

            elif next_state == MSGIDP:
                raise CheckException( 'Unexpected %s at %s'
                                      % ( next_state, where ) )

            elif next_state == MSGSTR:
                state = MSGSTR
                msg.add_new_msgstr( line, lineno )

            elif next_state == STR:
                # Continuation of msgid_plural, stay in state MSGIDP
                msg.add_msgidp( line, lineno )

            elif next_state == OLD:
                # Drop the current item; the state itself is kept
                msg = None

            else:
                raise CheckException( 'Unexpected state in po parsing '
                                      '(state = %s)' % state )

        elif state == MSGSTR:
            # Expect comment, or msgid, or string.
            if next_state == CMNT:
                # A comment probably starts a new item
                state = CMNT
                msg = create_new_Msgid( msgs, lineno )
                msg.add_cmnt( line )

            elif next_state == MSGID:
                state = MSGID
                msg = create_new_Msgid( msgs, lineno )
                msg.add_msgid( line, lineno )

            elif next_state == MSGIDP:
                raise CheckException( 'Unexpected %s at %s'
                                      % ( next_state, where ) )

            elif next_state == MSGSTR:
                # New msgstr, probably for plural form
                # Stay in MSGSTR state
                msg.add_new_msgstr( line, lineno )

            elif next_state == STR:
                msg.add_msgstr( line, lineno )

            elif next_state == OLD:
                # Drop the current item; the state itself is kept
                msg = None

            else:
                raise CheckException( 'Unexpected state in po parsing '
                                      '(state = %s)' % state )

        else:
            raise CheckException( 'Unexpected state in po parsing '
                                  '(state = %s)' % state )

    # Strip items with just comments. (Can this happen?)
    msgs = [ entry for entry in msgs if entry.msgid or entry.msgstr ]
    return msgs

def analyze_msgs( args, fname, msgs, nr_templates = None, nth = 0 ):
    """Run all checks on the messages of one PO file and print the report.

    args          not used by this function
    fname         name of the PO file, only used in the report
    msgs          the entries of the PO file (Msgid objects)
    nr_templates  number of entries in the template; when None the number
                  of entries in msgs is used instead
    nth           index of this file in the run; a separator is printed
                  before every report but the first

    Untranslated entries (all msgstr empty) and fuzzy entries are counted
    but not checked. The report consists of the totals, one summary line
    per check, three coverage percentages and the details of every check
    that found something.
    """
    checks = [
        Check_fmt( '%s' ),
        Check_fmt( '%d' ),
        Check_named_fmt(),
        Check_mapping_fmt(),
        Check_missing_sd(),
        Check_runaway(),
        Check_xml_chars(),
        Check_last_char(),
        Check_shortcut_trans(),
    ]

    nr_fuzzy = 0
    nr_untranslated = 0
    for msg in msgs:
        if ''.join( msg.msgstr ) == '':
            nr_untranslated += 1
            continue

        if msg.is_fuzzy:
            nr_fuzzy += 1
            continue

        for c in checks:
            c.process( msg )

    nr_msgs = len(msgs)
    if nr_templates is None:
        # No template count given: the '%d' and float() below cannot
        # handle None, so fall back to the size of the PO file itself.
        nr_templates = nr_msgs

    if nth > 0:
        # print("") emits an empty separator line. A bare `print` is only a
        # name lookup in Python 3 and prints nothing.
        print("")
        print("=====================================")
    print("%-20s%s"     % ( "File:",              fname ))
    print("%-20s%d"     % ( "Template total:",    nr_templates ))
    print("%-20s%d"     % ( "PO total:",          nr_msgs ))
    print("%-20s%d"     % ( "Fuzzy:",             nr_fuzzy ))
    print("%-20s%d"     % ( "Untranslated:",      nr_untranslated ))

    for c in checks:
        c.summary()

    # An empty PO file or template is reported as 0% instead of raising
    # ZeroDivisionError.
    if nr_msgs:
        po_coverage = (1.0 - (float(nr_untranslated) / float(nr_msgs))) * 100
    else:
        po_coverage = 0.0
    print("%-20s%5.2f%%" % ( "PO Coverage:",       po_coverage ))

    not_displayed = nr_untranslated + nr_fuzzy
    if nr_templates:
        template_coverage = po_coverage * float(nr_msgs) / float(nr_templates)
        translation = (1.0 - (float(not_displayed) / float(nr_templates))) * 100
    else:
        template_coverage = 0.0
        translation = 0.0
    print("%-20s%5.2f%%" % ( "Template Coverage:", template_coverage ))

    text = "%-20s%5.2f%%" % ( "Localized at:",     translation)

    # Differing coverages mean that the PO file and the template do not
    # have the same number of entries.
    if int(template_coverage*1000) == int(po_coverage*1000):
        print(text)
    else:
        print(text + ' (previous gramps.pot)')

    for c in checks:
        c.diag()

def main():
    """Command line entry point.

    Usage: -s FILE.po [FILE.po ...]

    Every file must be a .po file in the current directory. The template
    'gramps.pot' is read from the current directory as well, and a report
    is printed for each PO file. Without -s nothing is done. A parse error
    is reported and ends the run.
    """
    parser = ArgumentParser(description='This program validates '
                                        'a PO file for GRAMPS.')

    po_files = [name for name in os.listdir('.') if name.endswith('.po')]
    parser.add_argument("-s", dest="summary",
                        nargs='+',
                        choices=po_files,
                        default=False,
                        help="the summary of check, "
                             "and if need, it gives details")

    args = parser.parse_args()

    if not args.summary:
        return

    # Use what argparse parsed rather than slicing sys.argv, which is
    # wrong for spellings such as "-sfoo.po".
    files = args.summary

    try:
        pot_msgs = read_msgs( 'gramps.pot' )
        nr_templates = len( pot_msgs )
        for nth, fname in enumerate( files ):
            msgs = read_msgs( fname )
            analyze_msgs( files, fname, msgs, nr_templates, nth )

    except CheckException as e:
        print('Oops.', e)
        print('Bailing out')

# Run the checker only when this file is executed as a script
if __name__ == "__main__":
    main()
